#' ---
#' title: "WVS Wave 7 Recoding"
#' author: "Gento Kato & Fan Lu"
#' date: "August 23, 2023"
#' ---
#' 
#' # Preparation 
#' 

## Clean Up Space
## (removes every object currently in the global environment)
rm(list = ls())

## Set Working Directory (Automatically) ##
## The RStudio API is only usable inside an interactive RStudio session with
## a saved document. In any other setting (Rscript, report rendering, missing
## rstudioapi package, unsaved file) the working directory is left as it is
## instead of stopping the script with an error.
local({
  if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (nzchar(script_path)) {
      setwd(dirname(script_path))
    }
  }
})

## Data Directory: 
## ** This is a data location of RDS data of WVS Wave 7 downloaded from 
##    https://www.worldvaluessurvey.org/WVSDocumentationWV7.jsp
## ** If downloaded to the same folder as this file, set filedir <- "./"
## ** filedir is pasted directly in front of the file name, so it must end
##    with a path separator.
filedir <- "./"

## Import Original Data
do <- readRDS(paste0(filedir, "WVS_Cross-National_Wave_7_Rds_v5_0.rds"))
do <- subset(do, B_COUNTRY_ALPHA == "JPN") # Japan

## Library Psych Package
## NOTE(review): no psych function is called in the visible part of this
## script; require() only warns (does not stop) if the package is missing.
require(psych)

#'
#' # Data Manipulation
#'

# Start the recoded data set; its first column is the D_INTERVIEW value of
# each row of the original data
d <- data.frame(id = do[["D_INTERVIEW"]])

# Frequency of each J_INTDATE value (missing values shown as well)
table(do[["J_INTDATE"]], useNA = "always")

#'
#' ## DEPENDENT variables of (potential) interest
#' 

#'
#' ### Increase in immigrants
#'

# Q130 (Total N=1353)
# Immigration policy preference
# "How about people from other countries coming here to work. Which one of
#  the following do you think the government should do?"
#    1 = Let anyone come who wants to                                  (N=20)
#    2 = Let people come as long as there are jobs available           (N=533)
#    3 = Place strict limits on the number of foreigners who can come  (N=708)
#    4 = Prohibit people coming here from other countries              (N=11)
#   -1 = Don't know                                                    (N=74)
#   -2 = No answer                                                     (N=7)

tmp <- do$Q130
table(tmp, useNA = "always")
# Total number of responses (every table cell, NA included, adds up to this)
length(tmp)

# 1 = answers 1-2 (let people come), 0 = answers 3-4 (limit / prohibit);
# negative codes (don't know / no answer) become NA
d$immigincrease <- ifelse(tmp < 0, NA, tmp < 3) * 1
table(d$immigincrease, useNA = "always")

#'
#' ### Education
#'

tmp <- do$Q275
table(tmp, useNA = "always")
## 6=Bachelor, 7=Master, 8=Doctor

### Education (Ordinal)

# Recoded: 3 = codes 6-8, 2 = codes 4-5, 1 = any other non-negative code;
# negative codes become NA
d$edu <- ifelse(tmp < 0, NA,
                ifelse(tmp %in% c(6, 7, 8), 3,
                       ifelse(tmp %in% c(4, 5), 2, 1)))
# Make it a Factor
# Levels are listed explicitly so that each label stays attached to its own
# code even when one of the three categories has no respondents (without
# `levels`, factor() would fail or mislabel in that case).
d$edu <- factor(d$edu, levels = 1:3,
                labels = c("<=SHS",
                           ">SHS & <College(4yr)",
                           ">=College(4yr)"))
table(d$edu, useNA = "always")

# Education Treatment 
# edu2:  1 = ">=College(4yr)", 0 = both lower categories
d$edu2 <- ifelse(d$edu == ">=College(4yr)", 1, 0)
# edu2x: same as edu2, but the middle category is dropped (set to NA) so the
#        contrast is ">=College(4yr)" vs. "<=SHS" only
d$edu2x <- d$edu2
d$edu2x[which(d$edu == ">SHS & <College(4yr)")] <- NA
table(d$edu2, useNA = "always")
table(d$edu2x, useNA = "always")

#'
#' ### Age Cohort
#' 

tmp <- do$Q261
table(tmp, useNA = "always")
# A negative value cannot be a birth year; set it to NA so that it does not
# end up in the oldest cohort below
d$bornyr <- replace(tmp, which(tmp < 0), NA)

## Academic Year of Entering College
# The survey was on Sep 1-30, so assume that they haven't turned 19 yet.
d$univyr <- d$bornyr + 18

# Recoded Cohort (by the year the respondent turns 18, i.e. univyr)
## Cohort I   (-1974      Expansion)
## Cohort II  (1975-1989  Stagnation)
## Cohort III (1990-1999  Expansion)
## Cohort IV  (2000-2009  Universal)
## Cohort V   (2010-      split off from the "2000- Universal" period)
# findInterval() returns 0 below the first cut point and 4 at/above the last
# one, so adding 1 yields cohort codes 1-5; NA stays NA.
d$cohort <- findInterval(d$univyr, c(1975, 1990, 2000, 2010)) + 1
## A factor variable
# Levels are listed explicitly so labels stay aligned with codes 1-5 even if
# a cohort is empty.
d$cohort <- factor(d$cohort, levels = 1:5,
                   labels = c("Cohort I (18+ in -1974)",
                              "Cohort II (18+ in 1975-1989)",
                              "Cohort III (18+ in 1990-99)",
                              "Cohort IV (18+ in 2000-09)",
                              "Cohort V (18+ in 2010-)"))
table(d$cohort, useNA = "always")

#'
#' ### Gender
#'

table(do$Q260, useNA = "always")
# 1 = code 2, 0 = any other non-negative code; negative codes become NA
# instead of being counted as 0
d$female <- ifelse(do$Q260 < 0, NA, ifelse(do$Q260 == 2, 1, 0))
# Full column name is used: `d$fem` only worked through partial matching
table(d$female, useNA = "always")

# Complement of female (NA stays NA)
d$male <- 1 - d$female

#' 
#' ### Income
#' 

# Original
# Copy the raw Q288 codes into tmp and tabulate them, once from the source
# column and once from the copy
tmp <- do$Q288
table(do$Q288, useNA = "always")
table(tmp, useNA = "always")
# Recoded
## Percentile Conversion Function
## Replaces each category code by the midpoint of the cumulative-proportion
## interval that its category occupies among the non-missing responses.
##   old.var     : vector of category codes
##   missing.val : codes that are to be treated as missing (turned into NA)
## Returns a vector of the same length as old.var; missing entries are NA and
## the elements carry the category code as their name.
convper <- function(old.var, missing.val) {
  codes <- old.var
  codes[codes %in% missing.val] <- NA
  shares <- prop.table(table(codes))           # Share of each category
  upper <- cumsum(shares)                      # Cumulative Percentile
  midpoint <- upper - diff(c(0, upper)) / 2    # Take Midpoints
  midpoint[match(codes, names(midpoint))]
}
# Midpoint percentile of each respondent's Q288 category
# (codes -1 and -2 are treated as missing)
d$income <- convper(tmp, c(-1, -2))
table(d$income, useNA = "always")

# Three bands of the percentile score, split at 0.33 and 0.67 (upper bounds
# inclusive); respondents with a missing answer get their own category
d$incomecat <- as.character(cut(d$income,
                                breaks = c(-Inf, 0.33, 0.67, Inf),
                                labels = c("Low", "Middle", "High")))
d$incomecat[which(tmp %in% c(-1, -2) | is.na(tmp))] <- "Missing"
d$incomecat <- factor(d$incomecat,
                      levels = c("Low", "Middle", "High", "Missing"))
table(d$incomecat, useNA = "always")

#'
#' ### Jobs
#'

## Working Status
tmp <- do$Q279
table(tmp, useNA = "always")

# Every negative code (not only -2) and NA is treated as missing, so that no
# missing-value code can fall through to the "Not Employed" catch-all.
# Codes 1,3 / 2,5,6 / all remaining non-negative codes form the three groups.
d$workstat <- ifelse(tmp < 0 | is.na(tmp), NA,
                     ifelse(tmp %in% c(1, 3), "Self-Employed/Full-Time/Managerial",
                            ifelse(tmp %in% c(2, 5, 6), "Student/Housemaker/Part-Time",
                                   "Not Employed")))
# Level order is reversed, so "Not Employed" is the first (reference) level
d$workstat <- factor(d$workstat,
                     levels = rev(c("Self-Employed/Full-Time/Managerial",
                                    "Student/Housemaker/Part-Time", "Not Employed")))
table(d$workstat, useNA = "always")

# 0 = "Not Employed", 1 = either of the other two groups (NA stays NA)
d$employed <- ifelse(d$workstat == "Not Employed", 0, 1)
table(d$employed)

#'
#' ### Marital Status
#'

## Marital Status
tmp <- do$Q273
table(tmp, useNA = "always")

# 1 = codes 1-2, 0 = any other non-negative code
# (presumably 1 = married, 2 = living together as married -- confirm against
#  the codebook).
# Negative codes and NA become NA; previously an NA input, or any negative
# code other than -2, was silently coded 0.
d$married <- ifelse(tmp < 0 | is.na(tmp), NA, ifelse(tmp %in% c(1, 2), 1, 0))
table(d$married)

#'
#' ### Urban-Rural (Settlement Size)
#'

table(do$G_TOWNSIZE, useNA = "always")
# 1=<2000,2=2000-5000,3=5000-10000,4=10000-20000,
# 5=20000-50000,6=50000-100000,7=100000-500000,8=>500000
tmp <- do$G_TOWNSIZE

# Rescaled to 0-1: categories 1-4 are collapsed to 0, categories 5-8 map to
# 0.25, 0.5, 0.75 and 1. Negative codes become NA; previously they satisfied
# the "<4" test and were coded 0 like the smallest settlements.
d$urban <- ifelse(tmp < 0, NA, (ifelse(tmp < 4, 4, tmp) - 4) / 4)
table(d$urban)

#'
#' # Saving Data
#'

#+ eval=FALSE
# Write the recoded data frame to an RDS file in the working directory
saveRDS(object = d, file = "wvs7_japan_v1.rds")
